Source of the data: https://data.world/vizzup/mental-health-depression-disorder-data
## importing libraries
library(readxl)
library(plotly)
library(ggplot2)
library(dplyr)
library(tidyr)
library(ggmap)
library(maps)
library(corrplot)
library(lmtest)
library(car)
## Importing the mental health data set
# NOTE(review): the path below is absolute and machine-specific; the script
# only runs where this exact file location exists.
mental_health_path <-
  "C:/Users/Marti/OneDrive/Desktop/R_Project_2023/mental_health_data.xlsx"
mental_health <- read_excel(path = mental_health_path)
# Preview the first rows of the imported table.
head(mental_health)
## # A tibble: 6 x 10
## Entity Code Year `Schizophrenia (%)` `Bipolar disorder (%)`
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan AFG 1990 0.161 0.698
## 2 Afghanistan AFG 1991 0.160 0.698
## 3 Afghanistan AFG 1992 0.160 0.698
## 4 Afghanistan AFG 1993 0.160 0.698
## 5 Afghanistan AFG 1994 0.160 0.698
## 6 Afghanistan AFG 1995 0.160 0.699
## # i 5 more variables: `Eating disorders (%)` <dbl>,
## # `Anxiety disorders (%)` <dbl>, `Drug use disorders (%)` <dbl>,
## # `Depression (%)` <dbl>, `Alcohol use disorders (%)` <dbl>
# Print the structure of the imported table (column names, types, first values).
str(mental_health)
## tibble [6,468 x 10] (S3: tbl_df/tbl/data.frame)
## $ Entity : chr [1:6468] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Code : chr [1:6468] "AFG" "AFG" "AFG" "AFG" ...
## $ Year : num [1:6468] 1990 1991 1992 1993 1994 ...
## $ Schizophrenia (%) : num [1:6468] 0.161 0.16 0.16 0.16 0.16 ...
## $ Bipolar disorder (%) : num [1:6468] 0.698 0.698 0.698 0.698 0.698 ...
## $ Eating disorders (%) : num [1:6468] 0.1019 0.0993 0.0967 0.0943 0.0924 ...
## $ Anxiety disorders (%) : num [1:6468] 4.83 4.83 4.83 4.83 4.83 ...
## $ Drug use disorders (%) : num [1:6468] 1.68 1.68 1.69 1.71 1.72 ...
## $ Depression (%) : num [1:6468] 4.07 4.08 4.09 4.1 4.1 ...
## $ Alcohol use disorders (%): num [1:6468] 0.672 0.672 0.671 0.67 0.669 ...
#summary(mental_health)
There are 10 columns in the data set.
# List the column names of the data set.
names(mental_health)
## [1] "Entity" "Code"
## [3] "Year" "Schizophrenia (%)"
## [5] "Bipolar disorder (%)" "Eating disorders (%)"
## [7] "Anxiety disorders (%)" "Drug use disorders (%)"
## [9] "Depression (%)" "Alcohol use disorders (%)"
# Prevalence distribution for each mental disorder
# Reshape to long form so every disorder column becomes one facet.
# All three identifier columns (Entity, Code, Year) are excluded from the
# reshape. Stacking the character `Code` column together with the numeric
# prevalence columns forces every value to character and creates an all-NA
# "Code" group, so only the numeric disorder columns are pivoted here and no
# `as.numeric()` round-trip is needed.
mental_health %>%
  pivot_longer(
    cols = -c(Entity, Code, Year),
    names_to = "Disorder",
    values_to = "Prevalence"
  ) %>%
  ggplot(aes(x = Prevalence)) +
  geom_histogram(binwidth = 0.5) +
  # Each disorder gets its own x-axis range.
  facet_wrap(~Disorder, scales = "free_x") +
  xlab("Prevalence (%)") +
  ylab("Count") +
  theme_minimal()
## Warning: There was 1 warning in `mutate()`.
## i In argument: `Prevalence = as.numeric(Prevalence)`.
## Caused by warning:
## ! pojawiły się wartości NA na skutek przekształcenia
## Warning: Removed 6468 rows containing non-finite values (`stat_bin()`).
# Percentage of rows that contain at least one missing value.
100 * mean(!complete.cases(mental_health))
## [1] 15.15152
# Stacked bar chart of missing vs. present values for every column except
# Entity and Year.
# The NA flags are computed BEFORE reshaping, so all stacked columns are
# logical and the character `Code` column is not silently coerced together
# with the numeric prevalence columns. `count()` returns an ungrouped table,
# which avoids the "grouped output" message of group_by() + summarise().
mental_health %>%
  mutate(across(-c(Entity, Year), is.na)) %>%
  pivot_longer(
    cols = -c(Entity, Year),
    names_to = "Disorder",
    values_to = "Missing"
  ) %>%
  count(Disorder, Missing, name = "Count") %>%
  ggplot(aes(x = Disorder, y = Count, fill = Missing)) +
  geom_bar(stat = "identity") +
  ylab("Count") +
  theme_minimal()
## `summarise()` has grouped output by 'Disorder'. You can override using the
## `.groups` argument.
Approximately 15% of the rows contain missing data; these rows will be removed.
# Keep only the rows that have no missing value in any column.
mental_health_clean <- drop_na(mental_health)
# Tables with descriptive statistics for each disorder
# For every disorder column: mean and standard deviation (ignoring NAs) and
# the number of missing values. The identifier columns Entity, Code and Year
# are excluded; `Code` is character, so taking its mean/sd only yields NA plus
# warnings. `across()` with a named list replaces the deprecated `funs()`.
# Result columns are named `<column>_mean`, `<column>_sd`, `<column>_n_missing`.
mental_health %>%
  summarise(
    across(
      -c(Entity, Code, Year),
      list(
        mean = function(x) mean(x, na.rm = TRUE),
        sd = function(x) sd(x, na.rm = TRUE),
        n_missing = function(x) sum(is.na(x))
      )
    )
  )
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## i Please use a list of either functions or lambdas:
##
## # Simple named list: list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
##
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## i In argument: `Code_mean = mean(Code, na.rm = TRUE)`.
## Caused by warning in `mean.default()`:
## ! argument nie jest wartością liczbową ani logiczną: zwracanie wartości NA
## i Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 1 remaining warning.
## # A tibble: 1 x 24
## Code_mean Schizophrenia (%)_me~1 Bipolar disorder (%)~2 Eating disorders (%)~3
## <dbl> <dbl> <dbl> <dbl>
## 1 NA 0.212 0.719 0.240
## # i abbreviated names: 1: `Schizophrenia (%)_mean`,
## # 2: `Bipolar disorder (%)_mean`, 3: `Eating disorders (%)_mean`
## # i 20 more variables: `Anxiety disorders (%)_mean` <dbl>,
## # `Drug use disorders (%)_mean` <dbl>, `Depression (%)_mean` <dbl>,
## # `Alcohol use disorders (%)_mean` <dbl>, Code_sd <dbl>,
## # `Schizophrenia (%)_sd` <dbl>, `Bipolar disorder (%)_sd` <dbl>,
## # `Eating disorders (%)_sd` <dbl>, `Anxiety disorders (%)_sd` <dbl>, ...
# Correlation between the prevalence of the individual disorders.
# Select only numeric columns
numeric_data <- mental_health %>%
  select(where(is.numeric))
# Year is numeric but is not a disorder, so it is left out of the correlation
# matrix; only the seven prevalence columns remain.
disorder_data <- numeric_data %>%
  select(-Year)
# Compute correlation matrix (computed once; each pair of columns uses all
# rows where both values are present)
correlation_matrix <- cor(disorder_data, use = "pairwise.complete.obs")
# Plot correlation matrix
corrplot(correlation_matrix, method = "circle")
# Interactive heatmap of `correlation_matrix`, with the matrix column names on
# both axes and the axis titles blanked out.
# NOTE(review): "RdYlBu" is an RColorBrewer palette name; confirm that plotly's
# `colorscale` attribute recognises it (plotly's R API documents brewer
# palette names for the `colors` argument).
plot <- plot_ly(
  z = correlation_matrix,
  x = colnames(correlation_matrix),
  y = colnames(correlation_matrix),
  type = "heatmap",
  colorscale = "RdYlBu"
) %>%
  layout(
    title = "Correlation of mental disorders",
    xaxis = list(title = ""),
    yaxis = list(title = "")
  )
plot
# Simple linear regression on the complete-case data: depression prevalence
# as the response, anxiety-disorder prevalence as the single predictor.
model <- lm(
  formula = `Depression (%)` ~ `Anxiety disorders (%)`,
  data = mental_health_clean
)
# Coefficient table, residual summary and fit statistics.
summary(model)
##
## Call:
## lm(formula = `Depression (%)` ~ `Anxiety disorders (%)`, data = mental_health_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.27593 -0.42309 -0.05742 0.36680 2.78405
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.675964 0.030845 86.75 <2e-16 ***
## `Anxiety disorders (%)` 0.202317 0.007511 26.94 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6313 on 5486 degrees of freedom
## Multiple R-squared: 0.1168, Adjusted R-squared: 0.1167
## F-statistic: 725.6 on 1 and 5486 DF, p-value: < 2.2e-16
# Scatter plot of depression against anxiety-disorder prevalence
# (complete-case data) with a fitted linear trend line.
ggplot(
  data = mental_health_clean,
  mapping = aes(x = `Anxiety disorders (%)`, y = `Depression (%)`)
) +
  geom_point(colour = "deepskyblue") +
  geom_smooth(method = "lm", colour = "darkviolet") +
  labs(
    title = "Relationship between Depression and Anxiety Disorders",
    x = "Anxiety Disorders (%)",
    y = "Depression (%)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Scatter plot of eating-disorder against bipolar-disorder prevalence
# (complete-case data) with a fitted linear trend line.
ggplot(
  data = mental_health_clean,
  mapping = aes(x = `Bipolar disorder (%)`, y = `Eating disorders (%)`)
) +
  geom_point(colour = "deepskyblue") +
  geom_smooth(method = "lm", colour = "darkviolet") +
  labs(
    title = "Relationship between Bipolar and Eating disorders",
    x = "Bipolar disorder (%)",
    y = "Eating disorders (%)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Mean and median depression prevalence per year across all entities in the
# complete-case data. `aggregate()` returns one row per Year; because FUN
# returns a named vector, the statistics are stored in the matrix column `x`
# with columns "mean" and "median".
depression_by_year <- aggregate(
  mental_health_clean$`Depression (%)`,
  by = list(Year = mental_health_clean$Year),
  FUN = function(x) c(mean = mean(x), median = median(x))
)
# Line chart with one trace per statistic; the second trace reuses the x
# values (years) of the first. The title typo "depresion" is corrected.
plot <- plot_ly(
  x = depression_by_year$Year,
  y = depression_by_year$x[, "mean"],
  type = "scatter",
  mode = "lines",
  name = "Mean"
) %>%
  add_trace(y = depression_by_year$x[, "median"], name = "Median") %>%
  layout(
    title = "Development of depression over the years",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Depression level in %")
  )
plot
# Alcohol Use Disorder by Country in 2017
# Restrict the complete-case data to the 2017 observations.
alcohol_disorder_2017 <- subset(mental_health_clean, Year == 2017)
# World choropleth: regions are located via the `Code` column, coloured by
# alcohol-use-disorder prevalence, with the entity name as text.
# NOTE(review): "RdPu" is an RColorBrewer palette name; confirm that plotly's
# `colorscale` attribute recognises it.
plot_alcohol <- layout(
  plot_ly(
    data = alcohol_disorder_2017,
    type = 'choropleth',
    locations = alcohol_disorder_2017[["Code"]],
    z = alcohol_disorder_2017[["Alcohol use disorders (%)"]],
    text = alcohol_disorder_2017[["Entity"]],
    colorscale = "RdPu"
  ),
  title = "Alcohol Use Disorder by Country in 2017"
)
plot_alcohol
# Depression by Country in 2017
# Restrict the complete-case data to the 2017 observations.
depression_2017 <- subset(mental_health_clean, Year == 2017)
# World choropleth: regions are located via the `Code` column, coloured by
# depression prevalence, with the entity name as text.
# NOTE(review): "RdPu" is an RColorBrewer palette name; confirm that plotly's
# `colorscale` attribute recognises it.
plot_depression <- layout(
  plot_ly(
    data = depression_2017,
    type = 'choropleth',
    locations = depression_2017[["Code"]],
    z = depression_2017[["Depression (%)"]],
    text = depression_2017[["Entity"]],
    colorscale = "RdPu"
  ),
  title = "Depression by Country in 2017"
)
plot_depression